library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)

options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")

# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'

sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'

Case data

Load SFC and Alameda case data.

# block groups
sf_blockgroups <-
  block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

ac_blockgroups <-
  block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

sf_bgs <- sf_blockgroups %>% pull(GEOID)

ac_bgs <- ac_blockgroups %>% pull(GEOID)

# zip code areas
zctas_94 <- 
  zctas(cb = F, starts_with = "94") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_95 <- 
  zctas(cb = F, starts_with = "95") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_94_95 <- rbind(zctas_94, zctas_95)

# join with block groups
sf_bg_zctas <- sf_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

ac_bg_zctas <- ac_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)


# get SF case data
sf_place_cases <- 
  read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  filter(county == 'San Francisco')

# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")

# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
    regionString <- paste0("state:06+county:", county)
    censusData <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
  
  return(censusData)
}

sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)

sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>% 
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
  mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
  rename(zipcode = place)

# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")

ac_cases_zip <- ac_place_cases %>% 
  rename(date = DtCreate) %>%
  mutate(date = date %>%  substr(1,10) %>% as.Date()) %>%
  dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
  gather(key = "zipcode", value = "cases", -date) %>%
  mutate(cases = ifelse(cases == "<10", "5", cases), 
         zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
  mutate(cases = as.numeric(cases))

# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)

ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>% 
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
  mutate(cases_by_pop = cases / total_pop_zip)

# plot Alameda County case data
ac_cases_zip %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Alameda County")
# also plot SF for comparison
sf_cases_zip %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "San Francisco County")
# plot Alameda and SF together
combined_cases_zip <- rbind(cbind(ac_cases_zip, county_name = "Alameda") %>% dplyr::select(date, cases_by_pop, zipcode, county_name), cbind(sf_cases_zip, county_name = "San Francisco") %>% dplyr::select(date, cases_by_pop, zipcode, county_name))

combined_cases_zip %>% plot_ly() %>%
  add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode, linetype = ~county_name) %>%
  layout(title = "San Francisco and Alameda Counties")

Map the cases by zip code for the two counties.

curr_cases_sf_ac <- combined_cases_zip %>% filter((date == max(date) & county_name == "San Francisco") | (date == (max(date) - 2)) & county_name == "Alameda")

pal_1 <-
  colorNumeric(
    palette = "Blues",
    domain =
      c(0,curr_cases_sf_ac %>%
      pull(cases_by_pop) %>%
      unique())
  )

leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(
    data = curr_cases_sf_ac %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
    fill = TRUE,
    fillColor = ~pal_1(cases_by_pop),
    color = "white",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.5,
    group = "Cases per person",
    label = curr_cases_sf_ac$cases_by_pop*1000 %>% round(4)
  ) %>%
  addLayersControl(
    overlayGroups = c("Cases per 1000 people")
  )

Visits and social distancing data

Load SFC and Alameda social distancing data.

bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date())

# relevant dates
pre_sd_date_cutoff <- as.Date("2020-03-01")
shelter_date <- as.Date("2020-03-17") # note that this is the day the order went into effect

# get daily social distancing data for SF by date and zip code, including mean at home time
# first get block group populations for weighted averages
sf_pop_bg <- pullCensus("B01003_001E", sf_fips) %>% 
  rename(total_pop = B01003_001E)
ac_pop_bg <- pullCensus("B01003_001E", ac_fips) %>% 
  rename(total_pop = B01003_001E)

sf_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% sf_bgs) %>%
  left_join(sf_pop_bg, by = c("origin_census_block_group" = "blockgroup")) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count), not_home_time_wtavg = weighted.mean(mean_non_home_dwell_time, total_pop), home_time_wtavg = weighted.mean(mean_home_dwell_time, total_pop), median_not_home_time_wtavg = weighted.mean(median_non_home_dwell_time, total_pop), median_home_time_wtavg = weighted.mean(median_home_dwell_time, total_pop)) %>%
  mutate(percent_at_home = total_at_home_zip*100/total_devices,
         percent_leaving_home = (100 - percent_at_home)) %>%
  ungroup() %>%
  left_join(sf_pop_zip)

# get daily social distancing data for Alameda County by date
ac_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% ac_bgs) %>%
  left_join(ac_pop_bg, by = c("origin_census_block_group" = "blockgroup")) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count), not_home_time_wtavg = weighted.mean(mean_non_home_dwell_time, total_pop), home_time_wtavg = weighted.mean(mean_home_dwell_time, total_pop), median_not_home_time_wtavg = weighted.mean(median_non_home_dwell_time, total_pop), median_home_time_wtavg = weighted.mean(median_home_dwell_time, total_pop)) %>%
  mutate(percent_at_home = total_at_home_zip*100/total_devices,
         percent_leaving_home = (100 - percent_at_home)) %>%
  ungroup() %>%
  left_join(ac_pop_zip)

Out of curiosity, let’s look at the average number of devices recorded in each zip code for each week over time, starting the last half of February.

sf_devices_zip_by_week <- sf_sd_zip_by_date %>% 
  filter(date >= as.Date("2020-02-15")) %>%
  filter(!is.na(zipcode) & !is.na(total_devices)) %>%
  mutate(week = floor(((date - min(date)) / 7)) + 1) %>%
  group_by(week, zipcode) %>%
  summarize(avg_daily_devices = mean(total_devices)) %>% ungroup() %>%
  mutate(week_start = as.Date("2020-02-15") + (week - 1)*7)

ac_devices_zip_by_week <- ac_sd_zip_by_date %>% 
  filter(date >= as.Date("2020-02-15")) %>%
  filter(!is.na(zipcode) & !is.na(total_devices)) %>%
  mutate(week = floor(((date - min(date)) / 7)) + 1) %>%
  group_by(week, zipcode) %>%
  summarize(avg_daily_devices = mean(total_devices)) %>% ungroup() %>%
  mutate(week_start = as.Date("2020-02-15") + (week - 1)*7)

# plot 
sf_devices_zip_by_week %>% plot_ly() %>%
  add_trace(x = ~week_start, y = ~avg_daily_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "San Francisco, average daily devices recorded and week since 2/15")
ac_devices_zip_by_week %>% plot_ly() %>%
  add_trace(x = ~week_start, y = ~avg_daily_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Alameda, average daily devices recorded and week since 2/15")

Interesting that there is a drop right after SIP.

Also look just at daily device counts.

sf_sd_zip_by_date %>% plot_ly() %>%
  add_trace(x = ~date, y = ~total_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "San Francisco, daily devices recorded")
ac_sd_zip_by_date %>% plot_ly() %>%
  add_trace(x = ~date, y = ~total_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Alameda, daily devices recorded")

Excluding zip codes associated with universities, under the hypothesis that these might lead to weird results, in Alameda.

Extract the university results to be able to look at them, but then remove them from the general data. These are 94720, 94704, 94709 (UC Berkeley) and 94613 (Mills College) 94542 (CUESB).

First just look at the zip codes I am extracting, then look at their medians.

# extract university related ones, and look at the median measures and see how they change
univ_zips = c("94720","94704","94709","94613","94542")

mapview(zctas_94_95 %>% filter(ZCTA5CE10 %in% univ_zips), layer.name = "university zips")
ac_sd_zip_by_date_univ <- ac_sd_zip_by_date %>% filter(zipcode %in% univ_zips)

ac_sd_zip_by_date_univ %>% plot_ly() %>%
  add_trace(x = ~date, y = ~median_not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "Zip Code Weighted Average of Median Time Outside Home for University-Related Zip Codes")
# remove university related ones 
ac_sd_zip_by_date_no_univ <- ac_sd_zip_by_date %>% filter(!(zipcode %in% univ_zips))

Yep, they definitely have jumps at the new months. Out of curiosity let’s see how their number of devices changes over time.

ac_sd_zip_by_date_univ %>% plot_ly() %>%
  add_trace(x = ~date, y = ~total_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "University-related zip codes, daily devices recorded")

Note that the ones with the biggest changes in medians (94720, 94613, 94704) either have really small device counts, meaning a small change in the devices recorded could change the results a lot, or have huge changes in devices recorded with each new month.

Compare to non-university related ones:

ac_sd_zip_by_date_no_univ %>% plot_ly() %>%
  add_trace(x = ~date, y = ~total_devices, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Non-university-related zip codes, daily devices recorded")

Some of these do have big jumps too, though.

Load SF and Alameda visits data.

# read data on visits
sf_daily_visits_zip <- readRDS(paste0(sg_path, "weekly-patterns/v2/sf_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))

ac_daily_visits_zip <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))

# combine visits and cases and social distancing data and calculate visits per person
sf_sd_visits_cases_zip_by_date <- right_join(sf_sd_zip_by_date %>% dplyr::select(-total_pop_zip), sf_daily_visits_zip) %>% left_join(sf_cases_zip %>% dplyr::select(-total_pop_zip)) %>% 
  left_join(sf_pop_zip) %>%
  mutate(visits_per_capita = total_visits_avg / total_pop_zip)

ac_sd_visits_cases_zip_by_date <- right_join(ac_sd_zip_by_date_no_univ %>% dplyr::select(-total_pop_zip), ac_daily_visits_zip) %>% left_join(ac_cases_zip %>% dplyr::select(-total_pop_zip)) %>% 
  left_join(ac_pop_zip) %>%
  mutate(visits_per_capita = total_visits_avg / total_pop_zip)

Look at visits over time.

sf_sd_visits_cases_zip_by_date %>% plot_ly() %>%
  add_trace(x = ~date, y = ~visits_per_capita, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "SFC visits per capita over time")
ac_sd_visits_cases_zip_by_date %>% plot_ly() %>%
  add_trace(x = ~date, y = ~visits_per_capita, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita over time")

Also look at university-related visits data.

ac_sd_visits_cases_zip_by_date %>% filter(zipcode %in% univ_zips) %>% 
  plot_ly() %>%
  add_trace(x = ~date, y = ~visits_per_capita, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "University-related zip codes visits per capita over time")

These don’t look particularly weird when looking at visits data. Compare to non-university ones.

ac_sd_visits_cases_zip_by_date %>% filter(!(zipcode %in% univ_zips)) %>% 
  plot_ly() %>%
  add_trace(x = ~date, y = ~visits_per_capita, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
  layout(title = "Non-university-related zip codes visits per capita over time")

There might be a little more variation in the university ones. The 94720 data is also an outlier in how low the per-capita daily visits are.

Comparing visits and the other metrics.

Comparing visits and % leaving home, with each point on the scatter plot a zip code and a specific date.

# plot visits and % leaving home
sf_sd_visits_cases_zip_by_date %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home")
ac_sd_visits_cases_zip_by_date %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home")

Repeat with a couple specific dates. All are Mondays.

sf_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-03-09")) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home, 3/9")
ac_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-03-09")) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home, 3/9")
sf_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-03-23")) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home, 3/23")
ac_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-03-23")) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home, 3/23")
sf_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-05-18")) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home, 5/18")
ac_sd_visits_cases_zip_by_date %>% filter(date == as.Date("2020-05-18")) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home, 5/18")

Repeat with pre and post SIP. Pre is just 3/9-3/16.

sf_sd_visits_cases_zip_by_date %>% filter(date < shelter_date) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home, pre SIP")
sf_sd_visits_cases_zip_by_date %>% filter(date >= shelter_date) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and percent leaving home, post SIP")
ac_sd_visits_cases_zip_by_date %>% filter(date < shelter_date) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home, pre SIP")
ac_sd_visits_cases_zip_by_date %>% filter(date >= shelter_date) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and percent leaving home, post SIP")

Interesting that SF seems to have a difference pre and post SIP but that isn’t as clear in Alameda.

Comparing visits and average of median time outside home. Again each point is a zip code at a specific date.

sf_sd_visits_cases_zip_by_date %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and median not home time")
ac_sd_visits_cases_zip_by_date %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and median not home time")

Again pre and post SIP.

sf_sd_visits_cases_zip_by_date %>% filter(date < shelter_date) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and median not home time, pre SIP")
  sf_sd_visits_cases_zip_by_date %>% filter(date >= shelter_date) %>% 
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "San Francisco visits per capita and median not home time, post SIP")
ac_sd_visits_cases_zip_by_date %>% filter(date < shelter_date) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and median not home time, pre SIP")
ac_sd_visits_cases_zip_by_date %>% filter(date >= shelter_date) %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and median not home time, post SIP")

94132 seems to be suddenly spending a lot of time outside home post SIP, as well as 94708. 94708 is near Berkeley. What is 94132?

mapview(zctas_94_95 %>% filter(ZCTA5CE10 == "94132"))

Hmm..it’s right by Lake Merced park…maybe people who live near there are just getting out a lot more with SIP, or people are being marked incorrectly there?

Also interesting how before SIP, we see higher visits per capita seemingly associated with higher time not at home, as we would expect, but this seems to go away after SIP. But maybe the outliers are obscuring things?

ac_sd_visits_cases_zip_by_date %>% filter(date >= shelter_date) %>%
  filter(zipcode != "94708") %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~median_not_home_time_wtavg, type = 'scatter', mode = 'markers', color = ~zipcode) %>%
  layout(title = "Alameda visits per capita and median not home time, post SIP, 94708 removed")

Even when we get rid of that outlier the relationship isn’t seeming to be as clear.

Change in cases 4/21-present and visits since 4/16 through 5/24

Note that now zip codes are also listed in the labels that you see when you hover over a point.

visits_start_date <- as.Date("2020-04-16")
visits_end_date <- as.Date("2020-05-24")
cases_min_date <- as.Date("2020-04-21")

# SF first
# get initial cases
sf_init_cases <- sf_cases_zip %>% filter(date == cases_min_date) %>%
  dplyr::select(zipcode, cases_by_pop) %>% 
  rename(init_cases_by_pop = cases_by_pop) %>%
  mutate(log_init_cases_by_pop = log(init_cases_by_pop))

# summarize and add current and initial cases
sf_sd_visits_cases_zip_change <- sf_sd_visits_cases_zip_by_date %>% 
  filter(date >= visits_start_date & date <= visits_end_date) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(sf_init_cases) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)
  
# Alameda
# get initial cases
ac_init_cases <- ac_cases_zip %>% filter(date == cases_min_date) %>%
  dplyr::select(zipcode, cases_by_pop) %>% 
  rename(init_cases_by_pop = cases_by_pop) %>%
  mutate(log_init_cases_by_pop = log(init_cases_by_pop))

# summarize and add current and initial cases
ac_sd_visits_cases_zip_change <- ac_sd_visits_cases_zip_by_date %>% 
  filter(date >= visits_start_date & date <= visits_end_date) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(ac_init_cases) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# plot
# San Francisco
sf_sd_visits_cases_zip_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)

summary(model_sf_visits_cases_change)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0010835 -0.0007015 -0.0003363  0.0004772  0.0018545 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -2.918e-04  7.176e-04  -0.407   0.6880  
## visits_per_capita  6.773e-05  3.810e-05   1.778   0.0887 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008994 on 23 degrees of freedom
## Multiple R-squared:  0.1208, Adjusted R-squared:  0.08256 
## F-statistic:  3.16 on 1 and 23 DF,  p-value: 0.0887
sf_sd_visits_cases_zip_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)

summary(model_sf_visits_cases_change_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48337 -0.21345 -0.06558  0.10117  1.98881 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)       0.429974   0.387456   1.110    0.279
## visits_per_capita 0.005519   0.020573   0.268    0.791
## 
## Residual standard error: 0.4856 on 23 degrees of freedom
## Multiple R-squared:  0.00312,    Adjusted R-squared:  -0.04022 
## F-statistic: 0.07198 on 1 and 23 DF,  p-value: 0.7909

What’s up with that outlier in the change in log of cases plot? It’s zip code 94108. Let’s look at their values.

print(sf_sd_visits_cases_zip_change %>% filter(zipcode == "94108") %>% dplyr::select(confirmed_cases, cases_by_pop, total_pop_zip, init_cases_by_pop))
## # A tibble: 1 x 4
##   confirmed_cases cases_by_pop total_pop_zip init_cases_by_pop
##             <dbl>        <dbl>         <dbl>             <dbl>
## 1              12     0.000693         17320         0.0000577

It looks like they just started from a really low case number so the case growth just looks very exponential. Actually, were they the minimum initial case number?

print(sf_sd_visits_cases_zip_change$init_cases_by_pop)
##  [1] 1.742619e-03 2.729537e-03 1.179245e-03 1.316284e-03 3.637320e-03
##  [6] 5.773672e-05 1.045576e-03 2.306105e-03 2.839296e-04 1.557491e-03
## [11] 8.255523e-04 2.259215e-03 4.707887e-04 8.254230e-04 6.137121e-04
## [16] 7.543832e-04 5.720637e-04 1.066710e-03 2.782149e-03 1.001581e-03
## [21] 1.129495e-03 6.316169e-04 6.213223e-04 1.629767e-03 1.140344e-03

Yep, they were, and pretty significantly lower than all the others too. May be reasonable to exclude them then. Try plotting again without them in the log plot.

sf_sd_visits_cases_zip_change_edit <- sf_sd_visits_cases_zip_change %>% filter(zipcode != "94108") 

sf_sd_visits_cases_zip_change_edit %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change_edit)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, outlier removed")
model_sf_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change_edit)

summary(model_sf_visits_cases_change_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change_edit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3447 -0.1535 -0.0036  0.1294  0.4829 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -0.116711   0.176917  -0.660  0.51630   
## visits_per_capita  0.030601   0.009279   3.298  0.00328 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2109 on 22 degrees of freedom
## Multiple R-squared:  0.3308, Adjusted R-squared:  0.3004 
## F-statistic: 10.87 on 1 and 22 DF,  p-value: 0.00328

Now that is more significant!

Alameda:

ac_sd_visits_cases_zip_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)

summary(model_ac_visits_cases_change)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013365 -0.0006413 -0.0002422  0.0002693  0.0047075 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -9.385e-04  7.135e-04  -1.315  0.19517   
## visits_per_capita  8.124e-05  2.900e-05   2.801  0.00754 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001097 on 44 degrees of freedom
## Multiple R-squared:  0.1513, Adjusted R-squared:  0.1321 
## F-statistic: 7.847 on 1 and 44 DF,  p-value: 0.007538
ac_sd_visits_cases_zip_change %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)

summary(model_ac_visits_cases_change_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56302 -0.26967 -0.06993  0.25275  1.13273 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        0.15033    0.26810   0.561    0.578  
## visits_per_capita  0.02414    0.01090   2.215    0.032 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4123 on 44 degrees of freedom
## Multiple R-squared:  0.1003, Adjusted R-squared:  0.07986 
## F-statistic: 4.906 on 1 and 44 DF,  p-value: 0.03199

Note that the R squared went down from my last version of this–partly because I included a couple university zip codes that I left out last time, and partly since some case counts went up. I also changed the starting date and ending date on visits as well, slightly.

It’s worth checking on the initial case levels for these and excluding any extreme outliers (i.e. that start very low), since we did that for SF.

print(ac_sd_visits_cases_zip_change$init_cases_by_pop)
##  [1] 0.0003675883 0.0002993653 0.0003506501 0.0004110937 0.0004635788
##  [6] 0.0013582395 0.0014801916 0.0018417789 0.0013664106 0.0013429540
## [11] 0.0003946174 0.0002242926 0.0002688606 0.0003583933 0.0009958409
## [16] 0.0007522190 0.0011886801 0.0009560991 0.0012949260 0.0007977101
## [21] 0.0009038381 0.0007933360 0.0004790929 0.0010115662 0.0004745634
## [26] 0.0011293545 0.0009484374 0.0006120802 0.0005821145 0.0004229403
## [31] 0.0002848841 0.0003949447 0.0007014028 0.0006825939 0.0003691944
## [36] 0.0029313406 0.0015452346 0.0002951594 0.0004711526 0.0001635323
## [41] 0.0004013163 0.0002999580 0.0005510856 0.0005215396 0.0004167709
## [46] 0.0006386512

None of those look unreasonable, actually.

Maybe we should also identify the university zip codes?

ac_sd_visits_cases_zip_change_univ_mark <- ac_sd_visits_cases_zip_change %>%
  mutate(univ = zipcode %in% univ_zips)

ac_sd_visits_cases_zip_change_univ_mark %>%
  plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~univ) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change_univ_mark)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, univ zips marked")

Worth looking at.

Plot both on the same graph. Note that I excluded the SF outlier.

ac_sf_sd_visits_cases_zip_change <- rbind(ac_sd_visits_cases_zip_change %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda'), sf_sd_visits_cases_zip_change_edit %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco'))

ac_sf_sd_visits_cases_zip_change %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)

summary(model_combined_visits_cases_change_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5443 -0.2429 -0.0679  0.1839  1.1529 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.046845   0.166364  -0.282    0.779    
## visits_per_capita  0.030776   0.007279   4.228 7.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.358 on 68 degrees of freedom
## Multiple R-squared:  0.2081, Adjusted R-squared:  0.1965 
## F-statistic: 17.87 on 1 and 68 DF,  p-value: 7.207e-05

Lower R squared than SF on its own, higher than Alameda on its own, but p value more significant than either on their own.

Also do not log just to compare.

ac_sf_sd_visits_cases_zip_change %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in cases per person from 4/21-present'), title = "Visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)

summary(model_combined_visits_cases_change)
## 
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013328 -0.0006430 -0.0002723  0.0002319  0.0046490 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       -4.387e-04  4.827e-04  -0.909  0.36671   
## visits_per_capita  6.469e-05  2.112e-05   3.063  0.00314 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001039 on 68 degrees of freedom
## Multiple R-squared:  0.1212, Adjusted R-squared:  0.1083 
## F-statistic:  9.38 on 1 and 68 DF,  p-value: 0.003141

Splitting into two time chunks

Try looking at change in cases from 4/21 to 5/12, and 5/12 to present, and visits from 4/16 through 5/6 and 5/7 through 5/24.

visits_start_date_1 <- as.Date("2020-04-16")
visits_end_date_1 <- as.Date("2020-05-06")
cases_end_date_1 <- as.Date("2020-05-12")
visits_end_date_2 <- as.Date("2020-05-24")

# SF first
# first initial cases are the same as last time
sf_init_cases_1 <- sf_init_cases
# second initial cases start the end date of first segment
sf_init_cases_2 <- sf_cases_zip %>% filter(date == cases_end_date_1) %>%
  dplyr::select(zipcode, cases_by_pop) %>% 
  rename(init_cases_by_pop = cases_by_pop) %>%
  mutate(log_init_cases_by_pop = log(init_cases_by_pop))

# summarize and add current and initial cases, first segment
sf_sd_visits_cases_zip_change_1 <- sf_sd_visits_cases_zip_by_date %>% 
  filter(date >= visits_start_date_1 & date <= visits_end_date_1) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(sf_cases_zip %>% filter(date == cases_end_date_1) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(sf_init_cases_1) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# summarize and add current and initial cases, second segment
sf_sd_visits_cases_zip_change_2 <- sf_sd_visits_cases_zip_by_date %>% 
  filter(date > visits_end_date_1 & date <= visits_end_date_2) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(sf_init_cases_2) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# Alameda
# first initial cases are the same as last time
ac_init_cases_1 <- ac_init_cases
# second initial cases start the end date of first segment
ac_init_cases_2 <- ac_cases_zip %>% filter(date == cases_end_date_1) %>%
  dplyr::select(zipcode, cases_by_pop) %>% 
  rename(init_cases_by_pop = cases_by_pop) %>%
  mutate(log_init_cases_by_pop = log(init_cases_by_pop))

# summarize and add current and initial cases, first segment
ac_sd_visits_cases_zip_change_1 <- ac_sd_visits_cases_zip_by_date %>% 
  filter(date >= visits_start_date_1 & date <= visits_end_date_1) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(ac_cases_zip %>% filter(date == cases_end_date_1) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(ac_init_cases_1) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# summarize and add current and initial cases, second segment
ac_sd_visits_cases_zip_change_2 <- ac_sd_visits_cases_zip_by_date %>% 
  filter(date > visits_end_date_1 & date <= visits_end_date_2) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low)) %>%
  left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(ac_init_cases_2) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# combine
ac_sf_sd_visits_cases_zip_change_2_chunks <- rbind(ac_sd_visits_cases_zip_change_1 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda', time_chunk = 1), ac_sd_visits_cases_zip_change_2 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda', time_chunk = 2), sf_sd_visits_cases_zip_change_1 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco', time_chunk = 1), sf_sd_visits_cases_zip_change_2 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco', time_chunk = 2))

ac_sf_sd_visits_cases_zip_change_2_chunks %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county, symbol = ~time_chunk) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'change in log(cases per person)'), title = "Visits/person and change in cases/person, SF and Alameda, 2 time chunks")
model_combined_visits_cases_change_log_2_chunks <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks)

summary(model_combined_visits_cases_change_log_2_chunks)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change_2_chunks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41012 -0.19040 -0.08574  0.11898  2.28188 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       0.064736   0.091765   0.705  0.48170   
## visits_per_capita 0.024129   0.008038   3.002  0.00318 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3021 on 140 degrees of freedom
## Multiple R-squared:  0.06047,    Adjusted R-squared:  0.05376 
## F-statistic:  9.01 on 1 and 140 DF,  p-value: 0.00318

Remove that one SF zipcode again.

ac_sf_sd_visits_cases_zip_change_2_chunks_edit <- ac_sf_sd_visits_cases_zip_change_2_chunks %>% filter(zipcode != "94108")

ac_sf_sd_visits_cases_zip_change_2_chunks_edit %>% plot_ly() %>%
  add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county, symbol = ~time_chunk) %>%
  add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks_edit)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'change in log(cases per person)'), title = "Visits/person and change in cases/person, SF and Alameda, 2 time chunks")
model_combined_visits_cases_change_log_2_chunks_2 <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks_edit)

summary(model_combined_visits_cases_change_log_2_chunks_2)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change_2_chunks_edit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42199 -0.16292 -0.07314  0.13109  0.69572 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.039915   0.072014  -0.554     0.58    
## visits_per_capita  0.032269   0.006276   5.142 9.14e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2316 on 138 degrees of freedom
## Multiple R-squared:  0.1608, Adjusted R-squared:  0.1547 
## F-statistic: 26.44 on 1 and 138 DF,  p-value: 9.136e-07

Interesting, this doesn’t quite fit as well. But still significant. Might have to do with less time for cases to grow overall? A lot don’t have changes in this graph, which could be altering the results.

Case correlation with average household size

Looking at average household size and case data.

# block group populations
sf_pop_bg <- pullCensus("B01003_001E", sf_fips) %>% 
  rename(total_pop = B01003_001E)

ac_pop_bg <- pullCensus("B01003_001E", ac_fips) %>% 
  rename(total_pop = B01003_001E)

sf_avg_household_size <- pullCensus("B25010_001E", sf_fips) %>% 
  filter(B25010_001E > 0) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(sf_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode))

ac_avg_household_size <- pullCensus("B25010_001E", ac_fips) %>% 
  filter(B25010_001E > 0) %>%
  left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(ac_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode))

# correlate with case data - current cases
sf_cases_house_size_zip <- left_join(sf_cases_zip %>% filter(date == max(date)), sf_avg_household_size) %>% filter(!is.na(avg_household_size))

sf_cases_house_size_zip %>% plot_ly() %>%
  add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, sf_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "San Francisco County")
sf_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, sf_cases_house_size_zip)

summary(sf_model_household_size_cases)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = sf_cases_house_size_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0018051 -0.0012945 -0.0007524  0.0005495  0.0033981 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        0.0009224  0.0014445   0.639    0.529
## avg_household_size 0.0005546  0.0005950   0.932    0.361
## 
## Residual standard error: 0.001716 on 23 degrees of freedom
## Multiple R-squared:  0.0364, Adjusted R-squared:  -0.005501 
## F-statistic: 0.8687 on 1 and 23 DF,  p-value: 0.361
# alameda
ac_cases_house_size_zip <- left_join(ac_cases_zip %>% filter(date == max(date)), ac_avg_household_size) %>% filter(!is.na(avg_household_size))

ac_cases_house_size_zip %>% plot_ly() %>%
  add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, ac_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "Alameda County")
ac_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, ac_cases_house_size_zip)

summary(ac_model_household_size_cases)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = ac_cases_house_size_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019174 -0.0009146 -0.0001070  0.0007149  0.0042124 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.0029958  0.0012537  -2.390 0.021219 *  
## avg_household_size  0.0016881  0.0004395   3.841 0.000389 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001346 on 44 degrees of freedom
## Multiple R-squared:  0.2511, Adjusted R-squared:  0.2341 
## F-statistic: 14.75 on 1 and 44 DF,  p-value: 0.0003895

Interesting–a significant relationship in Alameda but not San Francisco. This adds to the story that is seeming to build that SF might be different from other places.

Just to compare, because I’m now curious, going to use the data from SCC too.

# block groups
scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>%
  st_transform('+proj=longlat +datum=WGS84')

scc_bgs <- scc_blockgroups %>% pull(GEOID)

# join with zip codes
scc_bg_zctas <- scc_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

# get case data
scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
scc_cases_zip <- scc_map_cases %>% 
  dplyr::select(zipcode, Cases, Population) %>% 
  st_drop_geometry() %>%
  rename(cases = Cases, pop = Population) %>%
  mutate(cases = replace_na(cases, 0)) %>%
  mutate(cases_by_pop = cases/pop) %>%
  filter(is.numeric(cases_by_pop))

# get population data 
scc_fips <- fips("CA", "Santa Clara") %>% substr(3,5)

scc_pop_zip <- pullCensus("B01003_001E", scc_fips) %>% 
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

scc_pop_bg <- pullCensus("B01003_001E", scc_fips) %>%
  rename(total_pop = B01003_001E)

scc_avg_household_size <- pullCensus("B25010_001E", scc_fips) %>% 
  filter(B25010_001E > 0) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  left_join(scc_pop_bg) %>%
  group_by(zipcode) %>%
  summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
  filter(!is.na(zipcode))

# correlate with case data - current cases
scc_cases_house_size_zip <- left_join(scc_cases_zip, scc_avg_household_size) %>% filter(!is.na(avg_household_size))

scc_cases_house_size_zip %>% plot_ly() %>%
  add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, scc_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "Santa Clara County")
scc_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, scc_cases_house_size_zip)

summary(scc_model_household_size_cases)
## 
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = scc_cases_house_size_zip)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0017069 -0.0004330 -0.0000825  0.0001695  0.0038324 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -0.0007294  0.0006548  -1.114  0.27031   
## avg_household_size  0.0006462  0.0002144   3.015  0.00394 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000879 on 53 degrees of freedom
## Multiple R-squared:  0.1464, Adjusted R-squared:  0.1303 
## F-statistic: 9.088 on 1 and 53 DF,  p-value: 0.003943

Again more of what we would expect. SF does seem to be odd.

Weighting visits

First get the areas of the POIs.

poi_areas <- read_csv('/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/geo-supplement/May2020Release/SafeGraphPlacesGeoSupplementSquareFeet.csv.gz')

poi_areas_selected <- poi_areas %>% dplyr::select(safegraph_place_id, area_square_feet)

I need hourly visits for each day to do this. That unfortunately takes me quite a while to run, so we’ll see how this goes. Going to limit it to 2 weeks at a time for now, starting with just the ones relevant for the previous analyses I was doing.

# files_to_process <- list.files(paste0(sg_hourly_path, "ByDay/"), pattern = "day")
# 
# files_to_process_selected <- files_to_process[64:77] # 5/11-5/24
# 
# bay_hourly_visits_places <- NULL
# ac_hourly_visits_zip <- NULL
# sf_hourly_visits_zip <- NULL
# for (i in 1:length(files_to_process_selected)) {
#   curr_bay_hourly_origins <- readRDS(paste0(sg_hourly_path, "ByDay/", files_to_process_selected[i])) %>% dplyr::select(safegraph_place_id, date, hour, visit_counts_hourly_high, visit_counts_hourly_low, origin_census_block_group, pop) %>% 
#     filter(!is.na(pop))
#   
#   # get hourly visits to each safegraph place
#   curr_bay_hourly_visits_places <- curr_bay_hourly_origins %>% 
#     group_by(safegraph_place_id, hour, date) %>%
#   summarize(total_visits_hourly_high = sum(visit_counts_hourly_high), total_visits_hourly_low = sum(visit_counts_hourly_low)) %>%
#   mutate(total_visits_hourly_avg = (total_visits_hourly_high + total_visits_hourly_low) / 2) %>%
#   ungroup()
#   
#   # get hourly visits by each zip code to each place in Alameda 
#   curr_ac_hourly_visits_zip <- curr_bay_hourly_origins %>% filter(origin_census_block_group %in% ac_bgs) %>% 
#     left_join(ac_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
#     group_by(zipcode, date, hour, safegraph_place_id) %>% 
#     summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
#     mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#   
#   # get hourly visits by each zip code to each place in San Francisco
#   curr_sf_hourly_visits_zip <- curr_bay_hourly_origins %>% filter(origin_census_block_group %in% sf_bgs) %>% 
#     left_join(sf_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
#     group_by(zipcode, date, hour, safegraph_place_id) %>% 
#     summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
#     mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#   
#   # combine
#   bay_hourly_visits_places <- rbind(bay_hourly_visits_places, curr_bay_hourly_visits_places)
#   ac_hourly_visits_zip <- rbind(ac_hourly_visits_zip, curr_ac_hourly_visits_zip)
#   sf_hourly_visits_zip <- rbind(sf_hourly_visits_zip, curr_sf_hourly_visits_zip)
# }
# 
# # save the rds files
# saveRDS(bay_hourly_visits_places, paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# saveRDS(ac_hourly_visits_zip, paste0(sg_hourly_path, "ac_hourly_visits_zip_5-11_5-24.rds"))
# saveRDS(sf_hourly_visits_zip, paste0(sg_hourly_path, "sf_hourly_visits_zip_5-11_5-24.rds"))

# # join the visits to places files with the areas and calculate visit/area ratios
# bay_hourly_visits_places_413_426 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
# bay_hourly_visits_places_427_510 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
# bay_hourly_visits_places_511_524 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# 
# # left join with areas
# bay_hourly_visits_places_413_426 <- bay_hourly_visits_places_413_426 %>% left_join(poi_areas_selected) %>% 
#   mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
# bay_hourly_visits_places_427_510 <- bay_hourly_visits_places_427_510 %>% left_join(poi_areas_selected) %>% 
#   mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
# bay_hourly_visits_places_511_524 <- bay_hourly_visits_places_511_524 %>% left_join(poi_areas_selected) %>% 
#   mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
# 
# # re-save the RDS files
# saveRDS(bay_hourly_visits_places_413_426, paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
# saveRDS(bay_hourly_visits_places_427_510, paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
# saveRDS(bay_hourly_visits_places_511_524, paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))

# load the files now that they have been processed
# load places and weights
bay_hourly_visits_places_413_426 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
bay_hourly_visits_places_427_510 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
bay_hourly_visits_places_511_524 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))

# load visits by origin
ac_hourly_visits_zip_413_426 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-13_4-26.rds"))
ac_hourly_visits_zip_427_510 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-27_5-10.rds"))
ac_hourly_visits_zip_511_524 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_5-11_5-24.rds"))
sf_hourly_visits_zip_413_426 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_4-13_4-26.rds"))
sf_hourly_visits_zip_427_510 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_4-27_5-10.rds"))
sf_hourly_visits_zip_511_524 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_5-11_5-24.rds"))
  
# now add the weights (visits per hour per area) to the visits data by origin
ac_hourly_visits_zip_413_426_weights <- left_join(ac_hourly_visits_zip_413_426, bay_hourly_visits_places_413_426 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

ac_hourly_visits_zip_427_510_weights <- left_join(ac_hourly_visits_zip_427_510, bay_hourly_visits_places_427_510 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

ac_hourly_visits_zip_511_524_weights <- left_join(ac_hourly_visits_zip_511_524, bay_hourly_visits_places_511_524 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

# SF
sf_hourly_visits_zip_413_426_weights <- left_join(sf_hourly_visits_zip_413_426, bay_hourly_visits_places_413_426 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

sf_hourly_visits_zip_427_510_weights <- left_join(sf_hourly_visits_zip_427_510, bay_hourly_visits_places_427_510 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

sf_hourly_visits_zip_511_524_weights <- left_join(sf_hourly_visits_zip_511_524, bay_hourly_visits_places_511_524 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>% 
  mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)

For clarity, here is a table exemplifying the data I am using. weighted_visits_by_ratio is the visits by that zip code multiplied by the number of total visits to that location over the location’s area, weighed_visits_by_ratio_sqr is the same except the (total visits to that location over the location’s area) is squared.

kable(ac_hourly_visits_zip_413_426_weights %>% dplyr::select(zipcode, date, hour, safegraph_place_id, total_visits_avg, place_total_visits_avg, area_square_feet, place_visits_per_area, weighted_visits_by_ratio, weighted_visits_by_ratio_sqr) %>% 
    .[1:50,])
zipcode date hour safegraph_place_id total_visits_avg place_total_visits_avg area_square_feet place_visits_per_area weighted_visits_by_ratio weighted_visits_by_ratio_sqr
94501 2020-04-13 1 sg:04c6d02363974bd38d1c2a4bee4c32a4 0.2919753 39.426037 1374160 0.0000287 0.0000084 0.0000000
94501 2020-04-13 1 sg:078a04c8a7fb4656a543e665527ea159 1.3666995 29.431109 570196 0.0000516 0.0000705 0.0000000
94501 2020-04-13 1 sg:0a2d5058e3714b7a8b4d953bf0adf62f 1.2142213 36.756980 95093129 0.0000004 0.0000005 0.0000000
94501 2020-04-13 1 sg:0fb0ca4cde7c4ef4b905340d1faeacfb 0.3396178 36.774126 151676 0.0002425 0.0000823 0.0000000
94501 2020-04-13 1 sg:111ab24c7047429585c803eb7ae99bdb 0.1609400 32.990999 561521 0.0000588 0.0000095 0.0000000
94501 2020-04-13 1 sg:1150eab79ba14eaea07dd966f27a5e85 36.9444864 36.944486 3450 0.0107085 0.3956218 0.0042365
94501 2020-04-13 1 sg:1509cc41681349d0b7a622cb635aac5c 1.1559252 34.330875 346612 0.0000990 0.0001145 0.0000000
94501 2020-04-13 1 sg:1735062c1ee34df7bb39127cb912ce66 0.7336302 32.317342 114621 0.0002819 0.0002068 0.0000001
94501 2020-04-13 1 sg:1941a94853df45aeb756f2026ca0ffe4 6.4515969 30.776152 310790 0.0000990 0.0006389 0.0000001
94501 2020-04-13 1 sg:1eeea35698bf486d8cb08a371f3eb823 0.9040179 19.608160 12415 0.0015794 0.0014278 0.0000023
94501 2020-04-13 1 sg:22a5672c0bfb4cae874ffcc90324b400 0.2930957 31.897638 457493 0.0000697 0.0000204 0.0000000
94501 2020-04-13 1 sg:29193f4b97c64ec28437d2349bf40c70 1.4501053 36.556157 815391 0.0000448 0.0000650 0.0000000
94501 2020-04-13 1 sg:32006d4bb5ea43038f1e7c969f64418e 2.1093750 14.514719 3293 0.0044077 0.0092976 0.0000410
94501 2020-04-13 1 sg:373642ecded84befa5298c040ef42945 0.7101977 46.284456 13615965 0.0000034 0.0000024 0.0000000
94501 2020-04-13 1 sg:3929f806d3214533b703755ff589263f 0.3637460 42.320334 172807 0.0002449 0.0000891 0.0000000
94501 2020-04-13 1 sg:42829ee300c543f88dc7ed5b199dce40 9.7147059 53.968805 2028 0.0266118 0.2585262 0.0068799
94501 2020-04-13 1 sg:42928ac18b3647158af774c1c3555ef6 6.4084084 21.955688 2851 0.0077010 0.0493515 0.0003801
94501 2020-04-13 1 sg:4765d7f23a544d00b47da92ca70ee08b 5.9206349 37.184924 63055 0.0005897 0.0034915 0.0000021
94501 2020-04-13 1 sg:47be1a236aa446158486563d1b5af240 1.4603365 25.062989 25590 0.0009794 0.0014303 0.0000014
94501 2020-04-13 1 sg:52f794242e8f45dfa843623708ca360e 0.8445084 66.721845 703859 0.0000948 0.0000801 0.0000000
94501 2020-04-13 1 sg:5d44cabbaf844c69ac94500fb5a157a6 17.5000000 17.500000 4049 0.0043221 0.0756360 0.0003269
94501 2020-04-13 1 sg:66a3f924cbff40f890e94d3e0fe4048e 5.3212670 12.940193 12426 0.0010414 0.0055415 0.0000058
94501 2020-04-13 1 sg:68a4065d338e4ec1beda445a6625d371 24.2647059 33.962773 1351 0.0251390 0.6099902 0.0153345
94501 2020-04-13 1 sg:6bed7ca0592c4cceab22a0e301b04580 2.6185345 13.474323 33938 0.0003970 0.0010396 0.0000004
94501 2020-04-13 1 sg:734b5b7d28614c2baeb0ea7ddb24b527 4.7448815 31.299596 228746 0.0001368 0.0006492 0.0000001
94501 2020-04-13 1 sg:767fabd815164a438afc6485ab527988 2.0813665 46.353935 55298 0.0008383 0.0017447 0.0000015
94501 2020-04-13 1 sg:76d9ffcc840649cc96855014177a1bb5 3.5387755 14.029823 117220 0.0001197 0.0004235 0.0000001
94501 2020-04-13 1 sg:7712a59c880547b1b1f224efdcfd70b0 0.3804538 29.175450 522829 0.0000558 0.0000212 0.0000000
94501 2020-04-13 1 sg:82fe8855e6d8463fa00babdd3ed68459 0.7271142 23.125278 1005 0.0230102 0.0167311 0.0003850
94501 2020-04-13 1 sg:8e589cc810534e4bbc77b93c57464c19 1.1520241 34.247811 943769 0.0000363 0.0000418 0.0000000
94501 2020-04-13 1 sg:8fb6d7cba90447909e7999441edaa6d8 1.6071429 25.887155 129580 0.0001998 0.0003211 0.0000001
94501 2020-04-13 1 sg:91d2eacfbfb049558c8cb490540182d4 7.3355482 28.378668 9886 0.0028706 0.0210574 0.0000604
94501 2020-04-13 1 sg:9a368d0076254cfeb0f6aa71e327b060 1.3560268 6.085938 28822 0.0002112 0.0002863 0.0000001
94501 2020-04-13 1 sg:9a396f32f2d748efb313c54ca31a90f2 2.4860491 34.736260 52092 0.0006668 0.0016578 0.0000011
94501 2020-04-13 1 sg:aef80f68e47c4a5aa5b2ac569865f0c1 0.8728448 51.025657 24943 0.0020457 0.0017856 0.0000037
94501 2020-04-13 1 sg:b55da1b654de4f7ebe896497feaef6b1 13.8468182 27.212177 12036 0.0022609 0.0313063 0.0000708
94501 2020-04-13 1 sg:c39e5d0bbc5040fe9e93b8cc08610983 17.5325581 21.192558 4897 0.0043277 0.0758750 0.0003284
94501 2020-04-13 1 sg:c4667ef299e945e081f0fba860078190 13.1452991 13.145299 8805185 0.0000015 0.0000196 0.0000000
94501 2020-04-13 1 sg:c4a1ebeb9f714cf0abd43b1f9df10b1a 38.5280000 154.466539 32373 0.0047715 0.1838349 0.0008772
94501 2020-04-13 1 sg:cb9a47c37a804f329e55310b2657a83d 1.9480620 11.351733 17528 0.0006476 0.0012616 0.0000008
94501 2020-04-13 1 sg:d795296c3f834841a393f167f0c0ac54 3.1308140 19.879307 7729 0.0025720 0.0080526 0.0000207
94501 2020-04-13 1 sg:d8f7933062f24dd0bc0e154c7d4ca084 1.2678725 18.227776 3340 0.0054574 0.0069193 0.0000378
94501 2020-04-13 1 sg:dd29ebfcd56f4be58c99edf56ae0a322 38.6842105 38.684210 7725 0.0050077 0.1937176 0.0009701
94501 2020-04-13 1 sg:ddfd196c3e6b44cea480a0d591b74c17 10.0426597 38.014404 1376 0.0276267 0.2774460 0.0076649
94501 2020-04-13 1 sg:ee389fc5cf294b15bcdc5968a4962d40 0.5472127 33.266603 2375819 0.0000140 0.0000077 0.0000000
94501 2020-04-13 1 sg:f1c51f98474e4e0889fba192e5d00a81 1.3372093 23.304180 114621 0.0002033 0.0002719 0.0000001
94501 2020-04-13 1 sg:f235487e0c5e4cb19cf3bf9856393596 4.2613857 33.709096 4606 0.0073185 0.0311870 0.0002282
94501 2020-04-13 1 sg:f460dad0a8694944b6401265e0708bdb 19.2053372 34.872349 97778 0.0003566 0.0068495 0.0000024
94501 2020-04-13 2 sg:01280c43a0e54530b6182b24b66fdff8 0.4576760 20.068828 44874 0.0004472 0.0002047 0.0000001
94501 2020-04-13 2 sg:02ad023976754a07bdd0c95ebbc59d6b 24.1659225 34.980391 14318 0.0024431 0.0590399 0.0001442

Summarize for each day and join into a single data frame for each county.

ac_daily_visits_zip_413_524_weights <- rbind(ac_hourly_visits_zip_413_426_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
  ac_hourly_visits_zip_427_510_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
  ac_hourly_visits_zip_511_524_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr))
  )

sf_daily_visits_zip_413_524_weights <- rbind(sf_hourly_visits_zip_413_426_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
  sf_hourly_visits_zip_427_510_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
  sf_hourly_visits_zip_511_524_weights %>% 
  group_by(zipcode, date) %>%
  summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr))
  )

Can now compare to the previous visits data.

sf_sd_visits_cases_zip_by_date_w_weighted <- left_join(sf_sd_visits_cases_zip_by_date, sf_daily_visits_zip_413_524_weights) %>% 
  mutate(weighted_visits_per_capita = total_weighted_visits_by_ratio / total_pop_zip, weighted_visits_ratio_sqr_per_capita = total_weighted_visits_by_ratio_sqr / total_pop_zip)

ac_sd_visits_cases_zip_by_date_w_weighted <- left_join(ac_sd_visits_cases_zip_by_date, ac_daily_visits_zip_413_524_weights) %>% 
  mutate(weighted_visits_per_capita = total_weighted_visits_by_ratio / total_pop_zip, weighted_visits_ratio_sqr_per_capita = total_weighted_visits_by_ratio_sqr / total_pop_zip)

sf_sd_visits_cases_zip_by_date_w_weighted %>% plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~visits_per_capita, type = 'scatter', mode = 'markers') %>%
  layout(title = "San Francisco County")
ac_sd_visits_cases_zip_by_date_w_weighted %>% plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~visits_per_capita, type = 'scatter', mode = 'markers') %>%
  layout(title = "Alameda County")

Now look at with cases. Again doing change in cases 4/21-present and visits since 4/16 through 5/24.

# SF first
# summarize and add current and initial cases
sf_sd_visits_cases_zip_change_w_weighted <- sf_sd_visits_cases_zip_by_date_w_weighted %>% 
  filter(date >= visits_start_date & date <= visits_end_date) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high) & !is.na(total_weighted_visits_by_ratio)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low), 
            total_weighted_visits = sum(total_weighted_visits_by_ratio),
            total_weighted_visits_by_ratio_sqr = sum(total_weighted_visits_by_ratio_sqr)) %>%
  left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip,
         weighted_visits_per_capita = total_weighted_visits / total_pop_zip,
         weighted_visits_per_capita_ratio_sqr = total_weighted_visits_by_ratio_sqr / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(sf_init_cases) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)
  
# Alameda
# get initial cases
ac_sd_visits_cases_zip_change_w_weighted <- ac_sd_visits_cases_zip_by_date_w_weighted %>% 
  filter(date >= visits_start_date & date <= visits_end_date) %>%
  filter(!is.na(zipcode) & !is.na(total_visits_high) & !is.na(total_weighted_visits_by_ratio)) %>%
  group_by(zipcode) %>%
  summarize(total_visits_high = sum(total_visits_high), 
            total_visits_low = sum(total_visits_low), 
            total_weighted_visits = sum(total_weighted_visits_by_ratio),
            total_weighted_visits_by_ratio_sqr = sum(total_weighted_visits_by_ratio_sqr)) %>%
  left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
  mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
         visits_per_capita = total_visits_avg / total_pop_zip,
         weighted_visits_per_capita = total_weighted_visits / total_pop_zip,
         weighted_visits_per_capita_ratio_sqr = total_weighted_visits_by_ratio_sqr / total_pop_zip) %>%
  filter(!is.na(cases_by_pop)) %>%
  left_join(ac_init_cases) %>%
  mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
         change_cases_by_pop = cases_by_pop - init_cases_by_pop)

# plot
# San Francisco
sf_sd_visits_cases_zip_change_w_weighted %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted <- lm(change_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)

summary(model_sf_visits_cases_change_weighted)
## 
## Call:
## lm(formula = change_cases_by_pop ~ weighted_visits_per_capita, 
##     data = sf_sd_visits_cases_zip_change_w_weighted)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0011003 -0.0006846 -0.0004876  0.0006605  0.0018655 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)                0.0002314  0.0006827   0.339    0.738
## weighted_visits_per_capita 0.0035985  0.0033202   1.084    0.290
## 
## Residual standard error: 0.0009355 on 23 degrees of freedom
## Multiple R-squared:  0.04859,    Adjusted R-squared:  0.007226 
## F-statistic: 1.175 on 1 and 23 DF,  p-value: 0.2897
sf_sd_visits_cases_zip_change_w_weighted %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)

summary(model_sf_visits_cases_change_weighted_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita, 
##     data = sf_sd_visits_cases_zip_change_w_weighted)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54716 -0.27788 -0.06889  0.13175  1.98223 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                  0.2610     0.3501   0.746    0.463
## weighted_visits_per_capita   1.3633     1.7024   0.801    0.431
## 
## Residual standard error: 0.4797 on 23 degrees of freedom
## Multiple R-squared:  0.02712,    Adjusted R-squared:  -0.01517 
## F-statistic: 0.6413 on 1 and 23 DF,  p-value: 0.4314

Again removing the weird zip code, 94108.

sf_sd_visits_cases_zip_change_w_weighted_edit <- sf_sd_visits_cases_zip_change_w_weighted %>% filter(zipcode != "94108")

sf_sd_visits_cases_zip_change_w_weighted_edit %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted_edit)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted_log_2 <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted_edit)

summary(model_sf_visits_cases_change_weighted_log_2)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita, 
##     data = sf_sd_visits_cases_zip_change_w_weighted_edit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49973 -0.17919  0.00321  0.18088  0.47388 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                 0.07202    0.17017   0.423   0.6763  
## weighted_visits_per_capita  1.89901    0.82316   2.307   0.0309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2313 on 22 degrees of freedom
## Multiple R-squared:  0.1948, Adjusted R-squared:  0.1582 
## F-statistic: 5.322 on 1 and 22 DF,  p-value: 0.03085

At least more significant.

Compare to using the square of the ratio:

sf_sd_visits_cases_zip_change_w_weighted_edit %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, sf_sd_visits_cases_zip_change_w_weighted_edit)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with square of ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with square of ratio")
model_sf_visits_cases_change_weighted_sqr_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, sf_sd_visits_cases_zip_change_w_weighted_edit)

summary(model_sf_visits_cases_change_weighted_sqr_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, 
##     data = sf_sd_visits_cases_zip_change_w_weighted_edit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50400 -0.17484  0.00142  0.21739  0.43481 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                            0.2784     0.1318   2.112   0.0463 *
## weighted_visits_per_capita_ratio_sqr  17.5128    12.4946   1.402   0.1750  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.247 on 22 degrees of freedom
## Multiple R-squared:  0.08198,    Adjusted R-squared:  0.04025 
## F-statistic: 1.965 on 1 and 22 DF,  p-value: 0.175

Less significant with the square of the ratio.

Try Alameda.

ac_sd_visits_cases_zip_change_w_weighted %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with simple ratio")
model_ac_visits_cases_change_weighted <- lm(change_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)

summary(model_ac_visits_cases_change_weighted)
## 
## Call:
## lm(formula = change_cases_by_pop ~ weighted_visits_per_capita, 
##     data = ac_sd_visits_cases_zip_change_w_weighted)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0019119 -0.0004788 -0.0001427  0.0001931  0.0039120 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -0.0017986  0.0007617  -2.361 0.022708 *  
## weighted_visits_per_capita  0.0166457  0.0044260   3.761 0.000496 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001036 on 44 degrees of freedom
## Multiple R-squared:  0.2433, Adjusted R-squared:  0.2261 
## F-statistic: 14.14 on 1 and 44 DF,  p-value: 0.0004964
ac_sd_visits_cases_zip_change_w_weighted %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with simple ratio")
model_ac_visits_cases_change_weighted_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)

summary(model_ac_visits_cases_change_weighted_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita, 
##     data = ac_sd_visits_cases_zip_change_w_weighted)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05605 -0.18465  0.05513  0.18785  0.83029 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.2877     0.2787  -1.032 0.307550    
## weighted_visits_per_capita   6.0282     1.6193   3.723 0.000557 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3791 on 44 degrees of freedom
## Multiple R-squared:  0.2395, Adjusted R-squared:  0.2222 
## F-statistic: 13.86 on 1 and 44 DF,  p-value: 0.0005571

Interesting–the weighting makes the Alameda results more significant than without the weighting, but that isn’t true in SF.

Try with square of ratio.

ac_sd_visits_cases_zip_change_w_weighted %>%
  plot_ly() %>%
  add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
  add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
  layout(xaxis = list(title = 'Total weighted (with square of ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with square of ratio")
model_ac_visits_cases_change_weighted_sqr_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, ac_sd_visits_cases_zip_change_w_weighted)

summary(model_ac_visits_cases_change_weighted_sqr_log)
## 
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, 
##     data = ac_sd_visits_cases_zip_change_w_weighted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7261 -0.2718 -0.0700  0.2180  1.2130 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.8266     0.1147   7.209 5.65e-09 ***
## weighted_visits_per_capita_ratio_sqr -13.0063    12.6949  -1.025    0.311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4296 on 44 degrees of freedom
## Multiple R-squared:  0.0233, Adjusted R-squared:  0.001102 
## F-statistic:  1.05 on 1 and 44 DF,  p-value: 0.3112

Again not significant with square of ratio.